Name: Zun Wang

Student ID: 915109847

Assignment 4: Rice SNPs and GWAS

Remember to include the relevant code from the lab page in this file so that this file will knit.

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Exercise 0: Be sure that your final knitted file has good formatting. Make sure that you are using informative variable names.

data.geno <- read_csv("../input/Rice_44K_genotypes.csv.gz",
                      na=c("NA","00"))
## Warning: Missing column names filled in: 'X1' [1]
## Warning: Duplicated column names deduplicated: '6_17160794' =>
## '6_17160794_1' [22253]
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
data.geno <- data.geno %>% select(-`6_17160794_1`)
head(data.geno[,1:10]) #first six rows of first 10 columns
summary(data.geno[,1:10]) #summarizes the first 10 columns
##       X1              1_13147            1_73192            1_74969         
##  Length:413         Length:413         Length:413         Length:413        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    1_75852            1_75953            1_91016            1_146625        
##  Length:413         Length:413         Length:413         Length:413        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##    1_149005           1_149754        
##  Length:413         Length:413        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character
data.geno <- data.geno %>% rename(ID=X1)

head(data.geno[,1:10])

Exercise 1: Create a data subset that contains a random sample of 10000 SNPs from the full data set. Place the smaller data set in an object called data.geno.10000. Very important: you want to keep the first column, the one with the variety IDs, and you want it to be the first column in data.geno.10000. AND You do not want it to show up randomly later on in the data set. Think about how to achieve this.

?append
data.geno.10000 = data.geno[,append(1, sample(2:ncol(data.geno),10000))]
data.geno.10000
dim(data.geno.10000)
## [1]   413 10001
colnames(data.geno.10000) %>% str_which("ID")
## [1] 1

Exercise 2: plot the variance explained by the first 10 rows

geno.numeric <- data.geno.10000[,-1] %>% # -1 to remove the first column, with names.
  lapply(factor) %>% # convert characters to "factors", where each category is internally represented as a number
  as.data.frame() %>% # reformat
  data.matrix() #  convert to numeric

head(geno.numeric[,1:10])
##      X5_3681136 X2_32604725 X6_28209149 X1_17506573 X1_1778061 X11_17984086
## [1,]          1           2           2           1          2            1
## [2,]          2           2           1           2          2            2
## [3,]          1           1           2           1          2            2
## [4,]          1          NA           1           1          1            2
## [5,]          2           1           2           1          2            2
## [6,]          1           2           2           1          2            2
##      X11_17628159 X1_35559480 X1_24271205 X12_17683640
## [1,]            3           2           2            1
## [2,]            1           1           1            1
## [3,]            1           1           2            1
## [4,]            1           2           2            1
## [5,]            1           1           2            1
## [6,]            3          NA           2            1
geno.numeric.fill <-
  apply(geno.numeric, 2, function(x) {
    x[is.na(x)] <- mean(x, na.rm=T)
    x})
geno.pca <- prcomp(geno.numeric.fill, 
            rank.=10) # We really only need the first few, this tells prcomp to only return the first 10
str(geno.pca)
## List of 5
##  $ sdev    : num [1:413] 32.26 13.59 13.06 7.42 6.14 ...
##  $ rotation: num [1:10000, 1:10] 0.008683 -0.004918 -0.011412 0.007824 -0.000436 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:10000] "X5_3681136" "X2_32604725" "X6_28209149" "X1_17506573" ...
##   .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:10000] 1.22 1.82 1.62 1.22 1.96 ...
##   ..- attr(*, "names")= chr [1:10000] "X5_3681136" "X2_32604725" "X6_28209149" "X1_17506573" ...
##  $ scale   : logi FALSE
##  $ x       : num [1:413, 1:10] -31.97 47.01 35.87 -5.57 34.14 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:10] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
head(geno.pca$x)[,1:5]
##             PC1        PC2        PC3        PC4         PC5
## [1,] -31.968928  -6.773336  19.284001 -1.0843823  0.84151681
## [2,]  47.011369  17.885011   8.075724  0.7796337 -3.34617048
## [3,]  35.865351 -20.891238  -8.009400 -6.4513608  0.99284666
## [4,]  -5.568475  -7.450594  -4.637975 40.4054200 -2.14945234
## [5,]  34.138704 -20.524952  -7.958786 -6.3324348  2.08317305
## [6,] -19.490962   6.324921 -17.189141 -1.5574399 -0.00198229
head(geno.pca$rotation)[,1:5]
##                        PC1          PC2          PC3           PC4          PC5
## X5_3681136    0.0086825637  0.012398179  0.006975544 -0.0005830784 -0.002973584
## X2_32604725  -0.0049179356  0.020768553  0.007032871 -0.0139483734  0.002404747
## X6_28209149  -0.0114118178 -0.002343160 -0.001407259 -0.0184125518 -0.002290831
## X1_17506573   0.0078240280 -0.003268742  0.000992394 -0.0050330733 -0.002401572
## X1_1778061   -0.0004358688  0.002288342  0.001083413 -0.0145468985  0.002077818
## X11_17984086  0.0088116624 -0.004844917 -0.009708924  0.0127642454  0.029001120
head(geno.pca$sdev)
## [1] 32.257603 13.589552 13.056567  7.424303  6.140059  5.186984
pcvar <- geno.pca$sdev^2 # square std dev to get variance
pcvar.pct <- tibble(pctvar=pcvar/sum(pcvar) * 100,
                    PC=1:length(pcvar))
plotdata <- data.frame(
  name=pcvar.pct$PC[1:10],
  value=pcvar.pct$pctvar[1:10]
)

ggplot(plotdata, aes(x=name, y=value)) + 
  geom_bar(stat = "identity")

PCs <- as_tibble(geno.pca$x) %>% # the principal components
  mutate(ID=data.geno.10000$ID) %>%
  select(ID, everything())
head(PCs)

Exercise 3: Make 2 scatter plots, the first of PC1 vs PC2, and second PC2 vs PC3. Is there any evidence for populations structure (different sub populations)? If so, how many sub populations do you think the MDS plot reveals? What do you make of the individuals that are between the major groups?

ggplot(PCs, aes(x=PC1, y=PC2)) +
  geom_point()

ggplot(PCs, aes(x=PC2, y=PC3)) +
  geom_point()

#There are evidences of subpopulations, and there are 4 subpopulation in total. The individuals between can be the crossed ones between the subgroups.

Exercise 4: * Use the read_csv() head() and summary() functions that you learned earlier to import and look at this file. Import the file into an object called “data.pheno”. * Use a join function to merge the PC genotype data (in the object PCs) with the phenotype data into a new object called “data.pheno.pca”. Use summary and head to look at the new object and make sure that it is as you expect.
* It (data.pheno.pca) should have 413 rows and 49 columns. * Include your code in the .Rmd

data.pheno <- read_csv("../input/RiceDiversity.44K.MSU6.Phenotypes.csv",
                      na=c("NA","00"))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   NSFTVID = col_character(),
##   Accession_Name = col_character(),
##   Country_of_Origin = col_character(),
##   Region = col_character(),
##   `Seed color` = col_character(),
##   `Pericarp color` = col_character()
## )
## See spec(...) for full column specifications.
head(data.pheno[,1:10]) 
summary(data.pheno[,1:10])
##    NSFTVID          Accession_Name     Country_of_Origin     Region         
##  Length:413         Length:413         Length:413         Length:413        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Alu.Tol       Flowering time at Arkansas Flowering time at Faridpur
##  Min.   :0.0300   Min.   : 54.50             Min.   : 39.00            
##  1st Qu.:0.4000   1st Qu.: 79.75             1st Qu.: 66.00            
##  Median :0.6035   Median : 87.71             Median : 74.00            
##  Mean   :0.5874   Mean   : 87.94             Mean   : 71.77            
##  3rd Qu.:0.7500   3rd Qu.: 96.83             3rd Qu.: 78.00            
##  Max.   :1.3500   Max.   :150.50             Max.   :110.00            
##  NA's   :43       NA's   :39                 NA's   :108               
##  Flowering time at Aberdeen FT ratio of Arkansas/Aberdeen
##  Min.   : 45.0              Min.   :0.3724               
##  1st Qu.: 81.5              1st Qu.:0.7975               
##  Median : 99.0              Median :0.8960               
##  Mean   :107.1              Mean   :0.8949               
##  3rd Qu.:114.0              3rd Qu.:1.0195               
##  Max.   :306.0              Max.   :1.7031               
##  NA's   :54                 NA's   :64                   
##  FT ratio of Faridpur/Aberdeen
##  Min.   :0.3459               
##  1st Qu.:0.6838               
##  Median :0.7720               
##  Mean   :0.7711               
##  3rd Qu.:0.8671               
##  Max.   :1.2885               
##  NA's   :109
data.pheno.pca <- inner_join(PCs, data.pheno, by=c("ID"="NSFTVID"))
dim(data.pheno.pca)
## [1] 413  49

Exercise 5: Prepare three different PCA plots to explore if subgroups vary by 1) Amylose content; 2) Pericarp color; 3) Region. That is make a scatter plot of PC1 vs PC2 and color the points by the above characteristics. Do any of these seem to be associated with the different population groups? Briefly discuss. (optionally repeat the plots plotting PC2 vs PC3)

#Amylose
ggplot(data.pheno.pca, aes(x=PC1, y=PC2, color=`Amylose content`)) +
  geom_point()

# Pericapr color
ggplot(data.pheno.pca, aes(x=PC1, y=PC2, color=`Pericarp color`)) +
  geom_point()

# Region
ggplot(data.pheno.pca, aes(x=PC1, y=PC2, color=`Region`)) +
  geom_point()

#The plots show that the two subgroups seperate with distinctive characteristics, with Asian group related to higher amylose and darker pericarp color compared to America/Europe group.

**Exercise 6:** First, use a join function to combine the PCA data (in objectPCs) with the population assignments (infs_results) and place the result ingeno.pca.pop` Then re plot the PCA data, but include the population assignment in an informative way. How do the populations assignments relate to the PCA plot?

data.geno.10000.fs <- matrix("",nrow=nrow(data.geno.10000)*2,ncol=ncol(data.geno.10000)-1+6)

for (i in 1:nrow(data.geno.10000)) {
  data.geno.10000.fs[(i-1)*2+1,1:6] <- data.geno.10000[[i,1]]
  data.geno.10000.fs[(i-1)*2+2,1:6] <- data.geno.10000[[i,1]]
  data.geno.10000.fs[(i-1)*2+1,-1:-6] <- substr(data.geno.10000[i,-1],1,1)
  data.geno.10000.fs[(i-1)*2+2,-1:-6] <- substr(data.geno.10000[i,-1],2,2)
}

data.geno.10000.fs[is.na(data.geno.10000.fs)] <- -9 # fastStructure's code for missing data

dim(data.geno.10000.fs)
## [1]   826 10006
#take a look
data.geno.10000.fs[1:10,1:10]
##       [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7] [,8] [,9]
##  [1,] "NSFTV1" "NSFTV1" "NSFTV1" "NSFTV1" "NSFTV1" "NSFTV1" "G"  "G"  "T" 
##  [2,] "NSFTV1" "NSFTV1" "NSFTV1" "NSFTV1" "NSFTV1" "NSFTV1" "G"  "G"  "T" 
##  [3,] "NSFTV3" "NSFTV3" "NSFTV3" "NSFTV3" "NSFTV3" "NSFTV3" "T"  "G"  "A" 
##  [4,] "NSFTV3" "NSFTV3" "NSFTV3" "NSFTV3" "NSFTV3" "NSFTV3" "T"  "G"  "A" 
##  [5,] "NSFTV4" "NSFTV4" "NSFTV4" "NSFTV4" "NSFTV4" "NSFTV4" "G"  "C"  "T" 
##  [6,] "NSFTV4" "NSFTV4" "NSFTV4" "NSFTV4" "NSFTV4" "NSFTV4" "G"  "C"  "T" 
##  [7,] "NSFTV5" "NSFTV5" "NSFTV5" "NSFTV5" "NSFTV5" "NSFTV5" "G"  "-9" "A" 
##  [8,] "NSFTV5" "NSFTV5" "NSFTV5" "NSFTV5" "NSFTV5" "NSFTV5" "G"  "-9" "A" 
##  [9,] "NSFTV6" "NSFTV6" "NSFTV6" "NSFTV6" "NSFTV6" "NSFTV6" "T"  "C"  "T" 
## [10,] "NSFTV6" "NSFTV6" "NSFTV6" "NSFTV6" "NSFTV6" "NSFTV6" "T"  "C"  "T" 
##       [,10]
##  [1,] "A"  
##  [2,] "A"  
##  [3,] "T"  
##  [4,] "T"  
##  [5,] "A"  
##  [6,] "A"  
##  [7,] "A"  
##  [8,] "A"  
##  [9,] "A"  
## [10,] "A"
write.table(data.geno.10000.fs,file="../output/rice.data.fastStructure.input.str", col.names = FALSE, row.names = FALSE, quote = FALSE)
fam <- tibble(
  FID=data.geno.10000$ID,
  IID=data.geno.10000$ID,
  PID=0,
  MID=0,
  Sex=0,
  Ptype=-9)

head(fam)
bim <- data.geno.10000.fs[,-1:-6]

colnames(bim) <- colnames(data.geno.10000)[-1]

bim[bim=="-9"] <- NA

bim <- apply(bim,2,function(x) unique(na.omit(x))) 

bim[,1:5]
##      5_3681136 2_32604725 6_28209149 1_17506573 1_1778061
## [1,] "G"       "G"        "T"        "A"        "T"      
## [2,] "T"       "C"        "A"        "T"        "G"
bim <- t(bim) %>%  # t transposes the matrix
  as_tibble() %>%
  mutate(SNP_ID=colnames(bim), cM=0) 
## Warning: `as_tibble.matrix()` requires a matrix with column names or a `.name_repair` argument. Using compatibility `.name_repair`.
## This warning is displayed once per session.
head(bim)
bim <- bim %>% 
  separate(SNP_ID,into = c("chromosome","position"),sep="_",remove=FALSE) %>% # create a column for chromosome and position
  select(chromosome, SNP_ID, cM, position, allele1=V1, allele2=V2) # get columns in right order

head(bim)
write.table(bim,file="../output/rice.data.fastStructure.input.bim",col.names = FALSE, row.names = FALSE, quote = FALSE)
fs_results <- read_delim("../output/rice.fastStructure.out.4.meanQ", delim="  ", col_names = FALSE, col_types = 'nnnn')
head(fs_results)
fs_results <- fs_results %>% 
  mutate(ID=data.geno.10000$ID) %>% 
  select(ID, pop1=X1, pop2=X2, pop3=X3, pop4=X4)
head(fs_results)
fs_results$assignedPop <- apply(fs_results[,-1], 1, which.max)
head(fs_results)
table(fs_results$assignedPop)
## 
##   1   2   3   4 
##  95 135 118  65
fs_results$maxPr <- apply(fs_results[,2:5],1,max) 
fs_results <- fs_results %>% 
  arrange(assignedPop,desc(maxPr)) %>%
  mutate(plot.order=row_number())
fs_results_long <- fs_results %>% pivot_longer(pop1:pop4,names_to="population",               values_to="proportion")
head(fs_results_long)
fs_results_long %>%
  ggplot(aes(x=plot.order, y=proportion, color=population, fill=population)) + 
  geom_col()  +
  ylab("geno
       me proportion") + 
  scale_color_brewer(type="div") + scale_fill_brewer(type="div")

fs_results <- fs_results %>% mutate(assignedPop=as.character(assignedPop))
geno.pca.pop <- inner_join(PCs, fs_results, by="ID")
ggplot(geno.pca.pop, aes(x=PC1, y=PC2, color=`assignedPop`)) +
  geom_point()

#The population shows in four assigned groups and align well with the previous PCA subgroups, indicating each subgroup have differenct amylose content and other attributes.
save(data.pheno,geno.pca, PCs, geno.pca.pop,fs_results,file="../output/data_from_SNP_lab.Rdata")